# package
library(ape)
## Warning: package 'ape' was built under R version 3.5.2
library(clusterProfiler)
## 
## clusterProfiler v3.10.1  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#bring in phase1 allele frequencies
phase1.all.freqs<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.all.freqs.csv")
#make column for id
phase1.all.freqs$id<-paste(phase1.all.freqs$chrom,phase1.all.freqs$POS)
#bring in dataframe with locality and environmental data for each individual phase1
phase1.alldata<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.allvariables.csv")
#subset to only the variables we need (lat,long,hum,temp,precip)
meta<-phase1.alldata[,c(16,17,18,20,22)]
#subset only the unique lat/longs
metapops<-unique(meta)
#sort by latitude to match the order of the allele frequency files
sort.pops.phase1 <-metapops[order(metapops$latitude),]

## read in lfmm outlier table
lfmm.outlier.table <- read.csv("~/Downloads/vetted.lfmm.prec.env.csv")[,2:3]
#add separate chrom and pos columns
lfmm.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,1]
lfmm.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,2]

## read in bayescan outlier table
bayescan.outlier.table <- read.csv("~/Downloads/vetted.bayescenv.prec.env.csv")[,2:3]
#add separate chrom and pos columns
bayescan.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,1]
bayescan.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,2]

## read in genomic regions and attributes file
genomic_location <- read.gff("~/Downloads/Anopheles_gambiae.AgamP4.47.chr.gff3.gz")
#keep only regions identified as "gene"
gen_regions <- genomic_location[genomic_location$type == "gene", ] # subset to only gene regions

#pull all genes queried
all.gene.id<-data.frame(do.call('rbind', strsplit(as.character(gen_regions$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP004677", "biotype=protein_coding",
## "description=methylenetetrahydrofolate dehydrogenase(NAD+) / 5%2C10-
## methenyltetrahydrofolate [Source:VB Community Annotation]", : number of columns
## of result is not a multiple of vector length (arg 1)
all.genes<-data.frame(do.call('rbind', strsplit(as.character(all.gene.id),':',fixed=TRUE)))[,2]
#write.table(all.genes, "~/Downloads/all.genes.anopheles.gambiae.txt", row.names = F, col.names = F, quote = F)
#define chroms
chroms <- c("2R", "2L", "3R", "3L","X") # chromosomes of interest

#initialize gene.list for lfmm
lfmm.gene.list<-data.frame()
#fill gene.list by matching location of genes of interest in gene feature regions by chromosomes
for (i in 1:length(chroms)){
  genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
  pos <- as.numeric(as.character(lfmm.outlier.table[lfmm.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
  for (w in 1:length(pos)) {
    if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
    lfmm.gene.list<-rbind(lfmm.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
    }
  }
  cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
## 
## chromosome 2 of 5 finished
## 
## chromosome 3 of 5 finished
## 
## chromosome 4 of 5 finished
## 
## chromosome 5 of 5 finished
#pull out gene ID as its own column
lfmm.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP001824", "biotype=protein_coding",
## "gene_id=AGAP001824", : number of columns of result is not a multiple of vector
## length (arg 1)
lfmm.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$gene.id),':',fixed=TRUE)))[,2]

#make SNP ID column
lfmm.gene.list$SNPid<-paste(lfmm.gene.list$seqid,lfmm.gene.list$`pos[w]`)

#subset
lfmm.gene.list<-lfmm.gene.list[,c("SNPid","gene")]

#number of candidate snps
nrow(lfmm.outlier.table)
## [1] 5594
#calculate percentage of lfmm candidate SNPs that were in a gene
nrow(lfmm.gene.list)/nrow(lfmm.outlier.table)
## [1] 0.5861637
#calc % of the genome covered by the genes we are searching
sum(gen_regions$end-gen_regions$start)/278268413 
## [1] 0.3935553
table(lfmm.gene.list$gene)[table(lfmm.gene.list$gene) >10]
## 
## AGAP000519 AGAP000801 AGAP001022 AGAP001023 AGAP001027 AGAP001029 AGAP001035 
##         36         18         53         11         26         28         25 
## AGAP001038 AGAP001039 AGAP001043 AGAP001046 AGAP001047 AGAP001048 AGAP001052 
##        104         32        117         36         16        139        203 
## AGAP001053 AGAP001061 AGAP001064 AGAP001065 AGAP001068 AGAP001069 AGAP001070 
##         69        330         29         18         32         62        146 
## AGAP001073 AGAP001076 AGAP001082 AGAP001083 AGAP001084 AGAP001090 AGAP001091 
##        223         43         55         14         72         52         84 
## AGAP001093 AGAP001094 AGAP004707 AGAP007731 AGAP007732 AGAP007736 AGAP007761 
##         80         36         11         15         35        118         17 
## AGAP010293 AGAP010295 AGAP010310 AGAP010311 AGAP010312 AGAP010313 AGAP010314 
##         15         51         25         21         39         20         34 
## AGAP013021 AGAP013341 AGAP029563 
##        163         19         30
hist(table(lfmm.gene.list$gene))

#write.table(unique(lfmm.gene.list$gene), "~/Downloads/lfmm.prec.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(lfmm.gene.list, "~/Downloads/lfmm.prec.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#initialize gene.list for bayescan
bayescan.gene.list<-data.frame()
#fill gene.list
for (i in 1:length(chroms)){
  genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
  pos <- as.numeric(as.character(bayescan.outlier.table[bayescan.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
  for (w in 1:length(pos)) {
    if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
      bayescan.gene.list<-rbind(bayescan.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
    }
  }
  cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
## 
## chromosome 2 of 5 finished
## 
## chromosome 3 of 5 finished
## 
## chromosome 4 of 5 finished
## 
## chromosome 5 of 5 finished
#pull out gene ID as its own column
bayescan.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP001824", "biotype=protein_coding",
## "gene_id=AGAP001824", : number of columns of result is not a multiple of vector
## length (arg 1)
bayescan.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$gene.id),':',fixed=TRUE)))[,2]

#make SNP ID column
bayescan.gene.list$SNPid<-paste(bayescan.gene.list$seqid,bayescan.gene.list$`pos[w]`)

#subset
bayescan.gene.list<-bayescan.gene.list[,c("SNPid","gene")]

#number of candidate snps
nrow(bayescan.outlier.table)
## [1] 1261
#calculate percentage of bayescan candidate SNPs that were in a gene
nrow(bayescan.gene.list)/nrow(bayescan.outlier.table)
## [1] 0.5099128
#calc % of the genome covered by the genes we are searching
sum(gen_regions$end-gen_regions$start)/278268413 
## [1] 0.3935553
table(bayescan.gene.list$gene)[table(bayescan.gene.list$gene) >10]
## 
## AGAP000893 AGAP000915 AGAP000940 AGAP000961 AGAP000962 AGAP000969 AGAP000974 
##         19         19         15         22         48         11         13 
## AGAP000984 AGAP000998 AGAP001015 AGAP001023 AGAP001052 AGAP001094 AGAP001674 
##         37         16         24         12         11         12         11 
## AGAP001824 
##         47
hist(table(bayescan.gene.list$gene))

#write.table(unique(bayescan.gene.list$gene), "~/Downloads/bayescan.prec.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(bayescan.gene.list, "~/Downloads/bayescan.prec.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#look into how many genes overlap between the methods
intersect(unique(lfmm.gene.list$gene),unique(bayescan.gene.list$gene))
##  [1] "AGAP001824" "AGAP029626" "AGAP002916" "AGAP029619" "AGAP001674"
##  [6] "AGAP004707" "AGAP006347" "AGAP006590" "AGAP029578" "AGAP007736"
## [11] "AGAP007754" "AGAP007761" "AGAP007762" "AGAP007763" "AGAP010816"
## [16] "AGAP010817" "AGAP000844" "AGAP000847" "AGAP029672" "AGAP000962"
## [21] "AGAP000974" "AGAP001022" "AGAP001023" "AGAP001025" "AGAP001029"
## [26] "AGAP001031" "AGAP001034" "AGAP001035" "AGAP001036" "AGAP001037"
## [31] "AGAP001038" "AGAP001039" "AGAP001040" "AGAP001043" "AGAP001046"
## [36] "AGAP001048" "AGAP001052" "AGAP001053" "AGAP013021" "AGAP001061"
## [41] "AGAP001064" "AGAP001065" "AGAP001069" "AGAP001070" "AGAP001073"
## [46] "AGAP001076" "AGAP001082" "AGAP001083" "AGAP001084" "AGAP001091"
## [51] "AGAP001092" "AGAP001093" "AGAP001094"

LFMM overrep testing

#overrepresentation testing
#read in each GO term to gene map (col1=geneID,col2=GOterm)
mart.export<-read.table("~/Downloads/mart_export (3).txt", sep = "\t", header = T)

#calculate overrepresentation for lfmm prec
lfmm.prec.enriched<-enricher(gene = lfmm.gene.list$gene,
                 universe = all.genes,
                   pAdjustMethod = "fdr",
                 TERM2GENE= mart.export[,c(2,1)]
)
result<-lfmm.prec.enriched@result

lfmm.prec.enriched.sig<-result[result$p.adjust <= .05,]
lfmm.prec.enriched.sig
#there are no significantly enriched GO terms
result[1:5,]
#write.table(lfmm.prec.enriched.sig[,c("ID","geneID")], file="~/Downloads/lfmm.prec.go.overrep.txt",
#            row.names = F, col.names = F, quote = F)

Bayescenv overrep testing

#calculate overrepresentation for bayescan prec
bayescan.prec.enriched<-enricher(gene = bayescan.gene.list$gene,
                            universe = all.genes,
                            pAdjustMethod = "fdr",
                            TERM2GENE= mart.export[,c(2,1)]
                            )
result<-bayescan.prec.enriched@result

#retain significantly enriched GO terms
bayescan.prec.enriched.sig<-result[result$p.adjust <= .05,]

#check out significantly overrepresented GO terms
bayescan.prec.enriched.sig
#get a vector of genes we found SNPs in for each enriched GO term
#bayescan
go.0042391<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0042391"]),'/'))
go.0007015<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0007015"]),'/'))
go.0050877<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0050877"]),'/'))
go.0045202<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0045202"]),'/'))
go.0006811<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0006811"]),'/'))
go.0038023<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0038023"]),'/'))
go.0045211<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0045211"]),'/'))
go.0005216<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0005216"]),'/'))
go.0007268<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0007268"]),'/'))
go.0036459<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0036459"]),'/'))
go.0043005<-unlist(strsplit(as.character(bayescan.prec.enriched.sig$geneID[bayescan.prec.enriched.sig$ID == "GO:0043005"]),'/'))

plot all SNPs associated w/ GO:0042391

bayescan.gene.list.go.0042391<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0042391,]
#plot all SNPs associated w/ GO:0042391
sig.pos<-bayescan.gene.list.go.0042391$SNPid
gene<-droplevels(bayescan.gene.list.go.0042391$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0007015

bayescan.gene.list.go.0007015<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0007015,]
#plot all SNPs associated w/ GO:0007015
sig.pos<-bayescan.gene.list.go.0007015$SNPid
gene<-droplevels(bayescan.gene.list.go.0007015$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0050877

bayescan.gene.list.go.0050877<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0050877,]
#plot all SNPs associated w/ GO:0050877
sig.pos<-bayescan.gene.list.go.0050877$SNPid
gene<-droplevels(bayescan.gene.list.go.0050877$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0045202

bayescan.gene.list.go.0045202<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0045202,]
#plot all SNPs associated w/ GO:0045202
sig.pos<-bayescan.gene.list.go.0045202$SNPid
gene<-droplevels(bayescan.gene.list.go.0045202$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0006811

bayescan.gene.list.go.0006811<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0006811,]
#plot all SNPs associated w/ GO:0006811
sig.pos<-bayescan.gene.list.go.0006811$SNPid
gene<-droplevels(bayescan.gene.list.go.0006811$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0038023

bayescan.gene.list.go.0038023<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0038023,]
#plot all SNPs associated w/ GO:0038023
sig.pos<-bayescan.gene.list.go.0038023$SNPid
gene<-droplevels(bayescan.gene.list.go.0038023$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0045211

bayescan.gene.list.go.0045211<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0045211,]
#plot all SNPs associated w/ GO:0045211
sig.pos<-bayescan.gene.list.go.0045211$SNPid
gene<-droplevels(bayescan.gene.list.go.0045211$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0005216

bayescan.gene.list.go.0005216<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0005216,]
#plot all SNPs associated w/ GO:0005216
sig.pos<-bayescan.gene.list.go.0005216$SNPid
gene<-droplevels(bayescan.gene.list.go.0005216$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0007268

bayescan.gene.list.go.0007268<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0007268,]
#plot all SNPs associated w/ GO:0007268
sig.pos<-bayescan.gene.list.go.0007268$SNPid
gene<-droplevels(bayescan.gene.list.go.0007268$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0036459

bayescan.gene.list.go.0036459<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0036459,]
#plot all SNPs associated w/ GO:0036459
sig.pos<-bayescan.gene.list.go.0036459$SNPid
gene<-droplevels(bayescan.gene.list.go.0036459$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0043005

bayescan.gene.list.go.0043005<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0043005,]
#plot all SNPs associated w/ GO:0043005
sig.pos<-bayescan.gene.list.go.0043005$SNPid
gene<-droplevels(bayescan.gene.list.go.0043005$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

Best places to find info:

uniprot database:

uniprot.org/

UC Irvine AGAM Gene Expression Profile

http://www.angagepuci.bio.uci.edu/

AG1000G selection atlas

https://malariagen.github.io/agam-selection-atlas/0.1-alpha3/index.html